library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
library(clValid)
x<-read.csv('https://raw.githubusercontent.com/arikunco/machinelearning/master/dataset/online_retail_clean.csv')
print(summary(x))
## CustomerID recency frequency monetary
## Min. :12347 Min. : 21.47 Min. : 1.000 Min. :-38970.00
## 1st Qu.:13752 1st Qu.: 47.43 1st Qu.: 1.000 1st Qu.: 13.20
## Median :15249 Median : 92.39 Median : 1.000 Median : 26.10
## Mean :15270 Mean :133.05 Mean : 2.259 Mean : 52.63
## 3rd Qu.:16792 3rd Qu.:203.47 3rd Qu.: 2.000 3rd Qu.: 56.73
## Max. :18283 Max. :394.51 Max. :86.000 Max. : 10122.56
p <- plot_ly(x, x = ~recency, y = ~frequency, z = ~monetary) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
x<-filter(x, monetary >= 0) %>% na.omit(x)
print(summary(x))
## CustomerID recency frequency monetary
## Min. :12347 Min. : 21.47 Min. : 1.000 Min. : 0.00
## 1st Qu.:13752 1st Qu.: 47.35 1st Qu.: 1.000 1st Qu.: 13.80
## Median :15252 Median : 92.22 Median : 1.000 Median : 27.09
## Mean :15270 Mean :132.11 Mean : 2.267 Mean : 71.20
## 3rd Qu.:16790 3rd Qu.:200.60 3rd Qu.: 2.000 3rd Qu.: 57.50
## Max. :18283 Max. :394.51 Max. :86.000 Max. :10122.56
x.scale <- as.data.frame(scale(x, scale = TRUE))
head(x.scale)
## CustomerID recency frequency monetary
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669
## 2 0.02366284 -1.062383 9.71540290 3.015655286
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362
## 4 -1.58093490 -1.061669 0.49905541 0.158692626
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785
summary(x.scale)
## CustomerID recency frequency
## Min. :-1.679176 Min. :-1.0627 Min. :-0.36498
## 1st Qu.:-0.872281 1st Qu.:-0.8141 1st Qu.:-0.36498
## Median :-0.009946 Median :-0.3831 Median :-0.36498
## Mean : 0.000000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.873071 3rd Qu.: 0.6579 3rd Qu.:-0.07697
## Max. : 1.731097 Max. : 2.5204 Max. :24.11595
## monetary
## Min. :-0.21248
## 1st Qu.:-0.17128
## Median :-0.13162
## Mean : 0.00000
## 3rd Qu.:-0.04089
## Max. :29.99504
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
fviz_nbclust(x.scale[,2:4], kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2)+
labs(subtitle = "Elbow method")
fviz_nbclust(x.scale[,2:4], kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
km.out <- kmeans(x.scale[,2:4], center=3, nstart=10)
x.scale$cluster <- km.out$cluster
x$cluster <- km.out$cluster
plot(x.scale[,2:4], col = x.scale$cluster,
main = "K-MEANS of RFM")
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary, color = ~cluster) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
fviz_nbclust(x.scale[,2:4], pam, method = "silhouette")+
theme_classic()
pam.out<-pam(x.scale[,2:4], 2, metric = "euclidean", stand = FALSE)
x.scale$clusterpam <-pam.out$clustering
head(x.scale)
## CustomerID recency frequency monetary cluster clusterpam
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669 2 1
## 2 0.02366284 -1.062383 9.71540290 3.015655286 3 1
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362 2 1
## 4 -1.58093490 -1.061669 0.49905541 0.158692626 2 1
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272 2 1
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785 2 1
plot(x.scale[,2:4], col = x.scale$clusterpam,
main = "K-MEDOIDS of RFM")
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary, color = ~clusterpam) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
res.dist <- dist(x.scale[,2:4], method = "euclidean")
res.hc <- hclust(d = res.dist, method = "complete")
summary(res.hc)
## Length Class Mode
## merge 4698 -none- numeric
## height 2349 -none- numeric
## order 2350 -none- numeric
## labels 2350 -none- character
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
plot(res.hc)
clmethods <- c("hierarchical","kmeans","pam")
intr <- clValid(x.scale[,2:4], nClust = 2:6, clMethods = clmethods,validation = "internal" ,maxitems = 2350,metric = "euclidean",method = "complete")
summary(intr)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 3.8579 10.4567 13.3151 22.8099 25.0266
## Dunn 0.5741 0.2713 0.2808 0.2056 0.2504
## Silhouette 0.9478 0.9077 0.8763 0.8311 0.8039
## kmeans Connectivity 14.4048 12.6524 65.1853 25.7480 102.0024
## Dunn 0.0611 0.1926 0.0005 0.0891 0.0017
## Silhouette 0.9111 0.9066 0.5582 0.7778 0.5400
## pam Connectivity 48.7032 79.6806 99.9222 156.5710 161.3635
## Dunn 0.0003 0.0005 0.0003 0.0003 0.0002
## Silhouette 0.5291 0.3844 0.4448 0.3302 0.3058
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 3.8579 hierarchical 2
## Dunn 0.5741 hierarchical 2
## Silhouette 0.9478 hierarchical 2
optimalScores(intr)
## Score Method Clusters
## Connectivity 3.8579365 hierarchical 2
## Dunn 0.5741036 hierarchical 2
## Silhouette 0.9477798 hierarchical 2
####Diketahui algoritma optimal adalah Hirarki Cluster dengan jumlah cluster = 2
hc.out <- cutree(res.hc, k=2)
x.scale$clusterhc <- hc.out
head(x.scale)
## CustomerID recency frequency monetary cluster clusterpam
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669 2 1
## 2 0.02366284 -1.062383 9.71540290 3.015655286 3 1
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362 2 1
## 4 -1.58093490 -1.061669 0.49905541 0.158692626 2 1
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272 2 1
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785 2 1
## clusterhc
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
plot(x.scale[,2:4], col = x.scale$clusterhc,
main = "HC of RFM")
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary, color = ~clusterhc) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
x.scale2<-filter(x.scale, monetary <= 16)
intr <- clValid(x.scale2[,2:4], nClust = 2:6, clMethods = clmethods,validation = "internal", maxitems = 2350 ,metric = "euclidean",method = "complete")
## Warning in clValid(x.scale2[, 2:4], nClust = 2:6, clMethods = clmethods, :
## rownames for data not specified, using 1:nrow(data)
summary(intr)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 6.5988 9.4571 18.9520 21.1687 29.8460
## Dunn 0.2713 0.2808 0.2056 0.2504 0.0409
## Silhouette 0.9077 0.8763 0.8311 0.8039 0.6298
## kmeans Connectivity 8.7944 61.3274 21.8901 98.1444 87.8675
## Dunn 0.1926 0.0005 0.0891 0.0017 0.0020
## Silhouette 0.9067 0.5580 0.7778 0.5398 0.5464
## pam Connectivity 48.2032 79.1806 99.4222 156.0710 160.8635
## Dunn 0.0004 0.0007 0.0004 0.0004 0.0003
## Silhouette 0.5391 0.4009 0.4555 0.3418 0.3174
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 6.5988 hierarchical 2
## Dunn 0.2808 hierarchical 3
## Silhouette 0.9077 hierarchical 2
optimalScores(intr)
## Score Method Clusters
## Connectivity 6.5988095 hierarchical 2
## Dunn 0.2808187 hierarchical 3
## Silhouette 0.9077082 hierarchical 2
res.dist <- dist(x.scale2[,2:4], method = "euclidean")
res.hc <- hclust(d = res.dist, method = "complete")
hc.out <- cutree(res.hc, k=3)
x.scale2$clusterhc <- hc.out
head(x.scale2)
## CustomerID recency frequency monetary cluster clusterpam
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669 2 1
## 2 0.02366284 -1.062383 9.71540290 3.015655286 3 1
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362 2 1
## 4 -1.58093490 -1.061669 0.49905541 0.158692626 2 1
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272 2 1
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785 2 1
## clusterhc
## 1 1
## 2 2
## 3 1
## 4 1
## 5 1
## 6 1
plot(x.scale2[,2:4], col = x.scale$clusterhc,
main = "HC of RFM")
p <- plot_ly(x.scale2, x = ~recency, y = ~frequency, z = ~monetary, color = ~clusterhc) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p